{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook scrapes all of the texts published in the Boletín Cultural y Bibliográfico and extracts relevant metadata.\n",
    "\n",
    "\n",
    "Specifically:\n",
    "- Part 1: a dataframe of texts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Part 0:\n",
    "\n",
    "import packages\n",
    "\n",
    "Open data (\"C:\\Users\\emili\\Desktop\\Tesis Ph.D\\Boletines\\PruebaBolletin\" holds two bulletins used to test this code; the full run below reads from `main_folder`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from io import BytesIO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import PyPDF2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import csv\n",
    "import pandas as pd\n",
    "import requests\n",
    "import re\n",
    "import urllib\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show every column when a DataFrame is displayed (None removes the column limit).\n",
    "pd.set_option(\"display.max_columns\",None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "# Record the current working directory at this point in the notebook.\n",
    "cw= os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/ecr-linux\n"
     ]
    }
   ],
   "source": [
    "# Switch the kernel's working directory to the user's home directory.\n",
    "# The explicit %cd magic is used so the cell does not depend on automagic.\n",
    "%cd ~"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total bulletins in dataset: 258\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# Parse every saved bulletin index page into one row per published text.\n",
    "text_columns = [\"text_title\", \"text_id\", \"authorship\", \"section_title\", \"bulletin_number\",\n",
    "                \"date_published\", \"text_link\"]\n",
    "df_authors = pd.DataFrame(columns=[\"name\", \"texts\", \"bio\", \"gender\"])\n",
    "article_counter = 0\n",
    "bulletin_counter = 0\n",
    "main_folder = \"/home/ecr-linux/Desktop/ThesisPh.D/CuratorialDiversityFinalVersions/0.BoletinIndexesHTMLs\"\n",
    "\n",
    "# Rows are collected in a list and turned into a DataFrame once at the end;\n",
    "# concatenating inside the loop re-copies the whole frame for every text.\n",
    "text_rows = []\n",
    "\n",
    "# NOTE: os.listdir returns entries in arbitrary order, so text_id values\n",
    "# follow whatever order the filesystem reports the files in.\n",
    "for file_name in os.listdir(main_folder):\n",
    "    if not file_name.endswith(\".html\"):\n",
    "        continue\n",
    "    file_path = os.path.join(main_folder, file_name)\n",
    "    with open(file_path, \"r\", encoding=\"utf8\") as f:\n",
    "        soup = BeautifulSoup(f.read(), 'html.parser')\n",
    "    bulletin_counter += 1\n",
    "\n",
    "    # Keep only the part of the page title before the first \"(\" or \"|\".\n",
    "    # A fixed-width slice left a dangling \" (\" on shorter labels\n",
    "    # (e.g. \"Vol. 4 Núm. 04 (\").\n",
    "    raw_title = soup.find(\"title\").string\n",
    "    bulletin_number = re.split(r\"[(|]\", raw_title, maxsplit=1)[0].strip()\n",
    "    date = soup.find(\"div\", {\"class\": \"published\"}).find(\"span\", {\"class\": \"value\"}).string[6:-5]\n",
    "    sections = soup.find_all(\"div\", {\"class\": \"section\"})\n",
    "\n",
    "    # Loop over the bulletin's sections to get all its texts.\n",
    "    for section in sections:\n",
    "        section_title = section.h2.string[6:-5]\n",
    "        summaries = section.find_all(\"div\", {\"class\": \"obj_article_summary\"})\n",
    "\n",
    "        for element in summaries:\n",
    "            article_counter += 1\n",
    "            # The fixed slices trim the padding around each value\n",
    "            # (presumably markup whitespace; TODO confirm against the pages).\n",
    "            authorship = element.find(\"div\", {\"class\": \"authors\"}).string[4:-3]\n",
    "            text_title = element.find(\"div\", {\"class\": \"title\"}).get_text()[5:-7]\n",
    "            text_link = element.find(\"a\")[\"href\"]\n",
    "\n",
    "            # No delay is needed here: the pages are read from local files,\n",
    "            # so nothing is requested over the network in this loop.\n",
    "            text_rows.append({\"text_title\": text_title, \"text_id\": article_counter, \"authorship\": authorship,\n",
    "                              \"section_title\": section_title, \"bulletin_number\": bulletin_number,\n",
    "                              \"date_published\": date, \"text_link\": text_link})\n",
    "\n",
    "df_texts = pd.DataFrame(text_rows, columns=text_columns)\n",
    "print(\"Total bulletins in dataset:\", bulletin_counter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text_title</th>\n",
       "      <th>text_id</th>\n",
       "      <th>authorship</th>\n",
       "      <th>section_title</th>\n",
       "      <th>bulletin_number</th>\n",
       "      <th>date_published</th>\n",
       "      <th>text_link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Sin título</td>\n",
       "      <td>1</td>\n",
       "      <td>Ramón Cote Baraibar</td>\n",
       "      <td>Poemas</td>\n",
       "      <td>Vol. 32 Núm. 40</td>\n",
       "      <td>1995-09-15</td>\n",
       "      <td>https://publicaciones.banrepcultural.org/index...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Visión en Cartagena</td>\n",
       "      <td>2</td>\n",
       "      <td>Ramón Cote Baraibar</td>\n",
       "      <td>Poemas</td>\n",
       "      <td>Vol. 32 Núm. 40</td>\n",
       "      <td>1995-09-15</td>\n",
       "      <td>https://publicaciones.banrepcultural.org/index...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Rafael Gutiérrez Girardot y sus afinidades ele...</td>\n",
       "      <td>3</td>\n",
       "      <td>Rodrigo Zuleta</td>\n",
       "      <td>Artículos</td>\n",
       "      <td>Vol. 32 Núm. 40</td>\n",
       "      <td>1995-09-15</td>\n",
       "      <td>https://publicaciones.banrepcultural.org/index...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Nicolás Gómez Dávila : la pasión del anacronismo</td>\n",
       "      <td>4</td>\n",
       "      <td>Óscar Torres Duque</td>\n",
       "      <td>Artículos</td>\n",
       "      <td>Vol. 32 Núm. 40</td>\n",
       "      <td>1995-09-15</td>\n",
       "      <td>https://publicaciones.banrepcultural.org/index...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Los pasos perdidos de Ernesto Volkening</td>\n",
       "      <td>5</td>\n",
       "      <td>Juan Guillermo Gómez García</td>\n",
       "      <td>Artículos</td>\n",
       "      <td>Vol. 32 Núm. 40</td>\n",
       "      <td>1995-09-15</td>\n",
       "      <td>https://publicaciones.banrepcultural.org/index...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7326</th>\n",
       "      <td>El mundo del libro: abril de 1961</td>\n",
       "      <td>7327</td>\n",
       "      <td>Agustín Rodríguez Garavito</td>\n",
       "      <td>Reseñas</td>\n",
       "      <td>Vol. 4 Núm. 04 (</td>\n",
       "      <td>1961-04-16</td>\n",
       "      <td>https://publicaciones.banrepcultural.org/index...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7327</th>\n",
       "      <td>Agradecimientos: abril de 1961</td>\n",
       "      <td>7328</td>\n",
       "      <td>Boletín Cultural y Bibliográfico Banco de la R...</td>\n",
       "      <td>Otras secciones</td>\n",
       "      <td>Vol. 4 Núm. 04 (</td>\n",
       "      <td>1961-04-16</td>\n",
       "      <td>https://publicaciones.banrepcultural.org/index...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7328</th>\n",
       "      <td>Movimientos de Libros  en la Sala General de L...</td>\n",
       "      <td>7329</td>\n",
       "      <td>Boletín Cultural y Bibliográfico Banco de la R...</td>\n",
       "      <td>Otras secciones</td>\n",
       "      <td>Vol. 4 Núm. 04 (</td>\n",
       "      <td>1961-04-16</td>\n",
       "      <td>https://publicaciones.banrepcultural.org/index...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7329</th>\n",
       "      <td>Misceláneas: abril de 1961</td>\n",
       "      <td>7330</td>\n",
       "      <td>Boletín Cultural y Bibliográfico Banco de la R...</td>\n",
       "      <td>Otras secciones</td>\n",
       "      <td>Vol. 4 Núm. 04 (</td>\n",
       "      <td>1961-04-16</td>\n",
       "      <td>https://publicaciones.banrepcultural.org/index...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7330</th>\n",
       "      <td>Últimas adquisiciones: abril de 1961</td>\n",
       "      <td>7331</td>\n",
       "      <td>Boletín Cultural y Bibliográfico Banco de la R...</td>\n",
       "      <td>Otras secciones</td>\n",
       "      <td>Vol. 4 Núm. 04 (</td>\n",
       "      <td>1961-04-16</td>\n",
       "      <td>https://publicaciones.banrepcultural.org/index...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7331 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             text_title text_id  \\\n",
       "0                                            Sin título       1   \n",
       "1                                   Visión en Cartagena       2   \n",
       "2     Rafael Gutiérrez Girardot y sus afinidades ele...       3   \n",
       "3      Nicolás Gómez Dávila : la pasión del anacronismo       4   \n",
       "4               Los pasos perdidos de Ernesto Volkening       5   \n",
       "...                                                 ...     ...   \n",
       "7326                  El mundo del libro: abril de 1961    7327   \n",
       "7327                     Agradecimientos: abril de 1961    7328   \n",
       "7328  Movimientos de Libros  en la Sala General de L...    7329   \n",
       "7329                         Misceláneas: abril de 1961    7330   \n",
       "7330               Últimas adquisiciones: abril de 1961    7331   \n",
       "\n",
       "                                             authorship    section_title  \\\n",
       "0                                   Ramón Cote Baraibar           Poemas   \n",
       "1                                   Ramón Cote Baraibar           Poemas   \n",
       "2                                        Rodrigo Zuleta        Artículos   \n",
       "3                                    Óscar Torres Duque        Artículos   \n",
       "4                           Juan Guillermo Gómez García        Artículos   \n",
       "...                                                 ...              ...   \n",
       "7326                         Agustín Rodríguez Garavito          Reseñas   \n",
       "7327  Boletín Cultural y Bibliográfico Banco de la R...  Otras secciones   \n",
       "7328  Boletín Cultural y Bibliográfico Banco de la R...  Otras secciones   \n",
       "7329  Boletín Cultural y Bibliográfico Banco de la R...  Otras secciones   \n",
       "7330  Boletín Cultural y Bibliográfico Banco de la R...  Otras secciones   \n",
       "\n",
       "       bulletin_number date_published  \\\n",
       "0     Vol. 32 Núm. 40      1995-09-15   \n",
       "1     Vol. 32 Núm. 40      1995-09-15   \n",
       "2     Vol. 32 Núm. 40      1995-09-15   \n",
       "3     Vol. 32 Núm. 40      1995-09-15   \n",
       "4     Vol. 32 Núm. 40      1995-09-15   \n",
       "...                ...            ...   \n",
       "7326  Vol. 4 Núm. 04 (     1961-04-16   \n",
       "7327  Vol. 4 Núm. 04 (     1961-04-16   \n",
       "7328  Vol. 4 Núm. 04 (     1961-04-16   \n",
       "7329  Vol. 4 Núm. 04 (     1961-04-16   \n",
       "7330  Vol. 4 Núm. 04 (     1961-04-16   \n",
       "\n",
       "                                              text_link  \n",
       "0     https://publicaciones.banrepcultural.org/index...  \n",
       "1     https://publicaciones.banrepcultural.org/index...  \n",
       "2     https://publicaciones.banrepcultural.org/index...  \n",
       "3     https://publicaciones.banrepcultural.org/index...  \n",
       "4     https://publicaciones.banrepcultural.org/index...  \n",
       "...                                                 ...  \n",
       "7326  https://publicaciones.banrepcultural.org/index...  \n",
       "7327  https://publicaciones.banrepcultural.org/index...  \n",
       "7328  https://publicaciones.banrepcultural.org/index...  \n",
       "7329  https://publicaciones.banrepcultural.org/index...  \n",
       "7330  https://publicaciones.banrepcultural.org/index...  \n",
       "\n",
       "[7331 rows x 7 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Create the output folder for the downloaded text pages if it doesn't exist\n",
    "# (exist_ok=True makes re-running this cell harmless).\n",
    "folder_path = '/home/ecr-linux/Desktop/ThesisPh.D/CuratorialDiversityFinalVersions/1.BoletinTextsHTMLsmetadata'\n",
    "os.makedirs(folder_path, exist_ok=True)\n",
    "\n",
    "# Pull the PDF link, abstract and keywords out of one article page\n",
    "def extract_info(html_content):\n",
    "    \"\"\"Parse an article page and return ``(pdf_link, summary, keywords)``.\n",
    "\n",
    "    ``pdf_link`` is the ``content`` attribute of the ``citation_pdf_url``\n",
    "    meta tag; ``summary`` and ``keywords`` are the stripped text of the\n",
    "    abstract and keywords divs. Each value is ``None`` when its tag is\n",
    "    not found in the page.\n",
    "    \"\"\"\n",
    "    page = BeautifulSoup(html_content, 'html.parser')\n",
    "\n",
    "    pdf_meta = page.find(\"meta\", {\"name\": \"citation_pdf_url\"})\n",
    "    abstract_div = page.find(\"div\", {\"class\": \"item abstract sheet active\"})\n",
    "    keywords_div = page.find(\"div\", {\"class\": \"item keywords\"})\n",
    "\n",
    "    pdf_link = None\n",
    "    if pdf_meta:\n",
    "        pdf_link = pdf_meta['content']\n",
    "\n",
    "    summary = None\n",
    "    if abstract_div:\n",
    "        summary = abstract_div.text.strip()\n",
    "\n",
    "    keywords = None\n",
    "    if keywords_div:\n",
    "        keywords = keywords_div.text.strip()\n",
    "\n",
    "    return pdf_link, summary, keywords\n",
    "\n",
    "# Function to download HTML, extract information, and update DataFrame\n",
    "def process_row(row):\n",
    "    \"\"\"Download one text's page, save it to disk and record its metadata.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    row : pandas.Series\n",
    "        A row of ``df_texts``. ``text_link`` and ``section_title`` are read,\n",
    "        and ``row.name`` is used as the row label.\n",
    "\n",
    "    Side effects: fills ``pdf_link``, ``summary`` and ``keywords`` for this\n",
    "    row in the module-level ``df_texts`` and writes the raw HTML into\n",
    "    ``folder_path``. Request failures are printed rather than raised.\n",
    "    Returns ``None``.\n",
    "    \"\"\"\n",
    "    url = row[\"text_link\"]\n",
    "    section = row['section_title']\n",
    "    index = row.name\n",
    "\n",
    "    try:\n",
    "        # A timeout keeps one unresponsive page from hanging the whole crawl;\n",
    "        # requests raises a RequestException subclass when it expires.\n",
    "        response = requests.get(url, timeout=30)\n",
    "        response.raise_for_status()  # Raise an HTTPError for bad responses\n",
    "\n",
    "        if response.status_code == 200:\n",
    "            html_content = response.text\n",
    "\n",
    "            pdf_link, summary, keywords = extract_info(html_content)\n",
    "\n",
    "            # Write into df_texts, the frame this function is applied to.\n",
    "            df_texts.at[index, 'pdf_link'] = pdf_link\n",
    "            df_texts.at[index, 'summary'] = summary\n",
    "            df_texts.at[index, 'keywords'] = keywords\n",
    "\n",
    "            # Save HTML file in the specified folder, named by the first\n",
    "            # three characters of the section title and the row label.\n",
    "            filename = os.path.join(folder_path, f\"{section[:3]}_{index}.html\")\n",
    "            with open(filename, 'w', encoding='utf-8') as file:\n",
    "                file.write(html_content)\n",
    "\n",
    "            # Introduce a delay of 1 second after processing each row\n",
    "            time.sleep(1)\n",
    "\n",
    "        else:\n",
    "            # Reached only for non-error responses other than 200.\n",
    "            print(f\"Failed to download HTML for index {index}. HTTP Status Code: {response.status_code}\")\n",
    "\n",
    "    except requests.RequestException as e:\n",
    "        print(f\"Failed to access URL for index {index}. Exception: {e}\")\n",
    "\n",
    "# Run process_row once per row of df_texts (axis=1 passes each row as a Series);\n",
    "# each call fetches that row's text_link over the network.\n",
    "df_texts.apply(process_row, axis=1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
